In [10]:
from Bio import AlignIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Graphics import GenomeDiagram
from reportlab.lib import colors
from reportlab.lib.units import cm
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import sys, os
import itertools

%matplotlib inline

################### Function Definitions ######################
def get_diff_locations(ref,compare):
    """
    Takes 2 Biopython Seq objects of equal size (aligned).
    Returns a list of locations at which base in compare sequence != base in ref sequence.
    """
    if len(ref) != len(compare):
        raise ValueError('Seqs are not the same length!')

    diff_locations = []
    for n,(r,c) in enumerate(itertools.izip(ref, compare)):
        if r != c:
            diff_locations.append(n)

    return diff_locations

def plot_tracks(tracks,tracks_path):
    '''
    Save tracks as pngs, then load them as images and display with matplotlib.
    '''
    
    #Write each diagram to a png, then read it back in using Matplotlib
    tracks.draw(format='linear', pagesize=(14*cm,7*cm), fragments=1,
             start=0, end=13378)
    plt.show()
    #get extension from tracks path
    root, ext = os.path.splitext(tracks_path)
    if not ext:
        ext = '.png'
        tracks_path += ext

    tracks.write(tracks_path, ext.strip('.'), dpi=600)
    tracks_im = mpimg.imread(tracks_path)

    #Plot each set of tracks
    fig = plt.figure(figsize=(14,7),dpi=600)
    ax = fig.add_axes([0.025,0.025,0.95,0.95],frameon=False)
    plt.axis('off')
    plt.imshow(tracks_im)
    
################### End Function Definitions ######################

In [11]:
#Import and sort the aligned sequences
all_aln = AlignIO.read('all_seq.aln','clustal')
all_aln.sort()

In [12]:
#collect ids and separate based on host (letter prefex preceding the year)
all_ids = [i.id for i in all_aln]
h_ids = all_ids[0:4]
s_ids = all_ids[4:]
print all_ids,'\n', h_ids, s_ids


['h1935', 'h1978', 'h2009', 'h2014', 's1935', 's1978', 's2009', 's2014'] 
['h1935', 'h1978', 'h2009', 'h2014'] ['s1935', 's1978', 's2009', 's2014']

Get Differences

Compare sequence from each year against the same-host sequence from 1935. Each time a difference is found, save the index at which it occurs.


In [13]:
#Compare each year to 1935
h_diffs = {}
s_diffs = {}
h_diffs['h1978'] = get_diff_locations(all_aln[0].seq,all_aln[1].seq)
h_diffs['h2009'] = get_diff_locations(all_aln[0].seq,all_aln[2].seq)
h_diffs['h2014'] = get_diff_locations(all_aln[0].seq,all_aln[3].seq)

s_diffs['s1978'] = get_diff_locations(all_aln[4].seq,all_aln[5].seq)
s_diffs['s2009'] = get_diff_locations(all_aln[4].seq,all_aln[6].seq)
s_diffs['s2014'] = get_diff_locations(all_aln[4].seq,all_aln[7].seq)

Plot


In [5]:
def setup(diagram_name, ids, length):
    '''
    Inits empty track diagram (with diagram_name as title), tracks, and feature sets for all ids.
    Adds bright green bg to each track (for contrast).
    Returns tracks, features, and feature_sets.
    '''
    #Create Genome Diagram for human
    track_diagram = GenomeDiagram.Diagram(diagram_name+' H1N1 Tracks Plot', tracklines = 0)

    track = {}
    feature_set = {}

    #Generate tracks and feature sets for each year
    track_count = 1
    for i in ids: 
        track['%s' %i] = track_diagram.new_track(track_count, name=i, greytrack=False)#greytrack=True, 
                                                 #greytrack_labels=1, greytrack_fontcolor = colors.cornsilk,
                                                 #greytrack_fontsize=15)
        
        feature_set['%s'%i] = track['%s' %i].new_set(name=i)
        
        #add contrast background
        feature_set['%s'%i].add_feature(SeqFeature(FeatureLocation(0,length)), 
                                   label=True, label_angle=0, color=colors.cadetblue)
        track_count += 1
    return track_diagram, track, feature_set

h_diagram, h_features, h_feature_set = setup(diagram_name='Human', ids = h_ids, length=all_aln.get_alignment_length())
s_diagram, s_features, s_feature_set = setup(diagram_name='Swine', ids = s_ids, length=all_aln.get_alignment_length())

In [6]:
def add_features_year_compare(feature_set, diffs, hostspecies):
    for k, features in feature_set.items():
        year = k[1:]
        if '1935' in k:
            #if the base year...
            label = '%s: %s (reference)'%(hostspecies,year)

        else:
            #name bg feature (which labels the track)
            label ='%s: %s --- %i differences vs. 1935'%(hostspecies, year, len(diffs[k]))
            # For each difference recorded in diffs, add as a feature to feature_set
            for i in diffs[k]:
                feature = SeqFeature(FeatureLocation(i,i))
                features.add_feature(feature, name=label, color=colors.aqua)
                
        # set the name of the first feature (the background feature) to label
        # this is stupid, but I'm not sure how to do it better
        features.get_features()[0].name = label 

add_features_year_compare(feature_set = h_feature_set, diffs = h_diffs, hostspecies = 'Human')
add_features_year_compare(feature_set = s_feature_set, diffs = s_diffs, hostspecies = 'Swine')
plot_tracks(h_diagram,'h_tracks.png')
plot_tracks(s_diagram, 's_tracks.png')

Compare Strains by Host


In [56]:
diffs = {}
diffs['1935'] = get_diff_locations(all_aln[0].seq,all_aln[4].seq)
diffs['1978'] = get_diff_locations(all_aln[1].seq,all_aln[5].seq)
diffs['2009'] = get_diff_locations(all_aln[2].seq,all_aln[6].seq)
diffs['2014'] = get_diff_locations(all_aln[3].seq,all_aln[7].seq)

In [58]:
def compare_strains_by_host(diffs):
    #Create Genome Diagram to compare human and swine in each year
    diagram = GenomeDiagram.Diagram('Human vs. Swine H1N1 Tracks Plot')
    tracks = {}
    feature_set = {}
    for n,(k,v) in enumerate(diffs.iteritems(),1):
        tracks[k] = diagram.new_track(n, greytrack=False)
        feature_set[k] = tracks[k].new_set()
    return diagram, tracks, feature_set

def add_features_host_compare(feature_set):
    for k,v in feature_set.iteritems():
        feature_set[k].add_feature(SeqFeature(FeatureLocation(0,13378)), 
                                       name = 'Human vs. Swine: %s --- %i differences'%(k,len(diffs[k])), 
                                       label=True, label_angle=0, color=colors.cadetblue);
        for i in diffs[k]:
            feature = SeqFeature(FeatureLocation(i,i))
            feature_set[k].add_feature(feature, color=colors.aqua)

            
diagram, tracks, feature_set = compare_strains_by_host(diffs)
add_features_host_compare(feature_set)
feature_set
plot_tracks(diagram, 'host_compare.png')

STOP


In [27]:
#Import the aligned sequences
h_aln = AlignIO.read('all_human.aln','clustal')
h_aln.sort()
print h_aln
s_aln = AlignIO.read('all_swine.aln','clustal')
s_aln.sort()
print s_aln
#get_diff_locations


SingleLetterAlphabet() alignment with 4 rows and 13334 columns
AATATGGAAAGAATAAAAGAACTACGAAATCTAATGTCGCAGTC...--- h1935
--TATGGAAAGAATAAAAGAGCTAAGGAGTCTGATGTCGCAGTC...--- h1978
---ATGGAGAGAATAAAAGAACTGAGAGATCTAATGTCGCAGTC...--- h2009
---ATGGAGAGAATAAAAGAACTGAGAGATCTAATGTCGCAGTC...TAC h2014
SingleLetterAlphabet() alignment with 4 rows and 13360 columns
TCAAATATATTCAATATGGAGAGAATAAAAGAACTAAGGGATCT...--- s1935
TCAAATATATTCAATATGGAGAGAATAAAGGAACTAAGAAATCT...--- s1978
---------------ATGGAGAGAATAAAAGAACTAAGAGATCT...CAC s2009
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...--- s2014

In [ ]:


In [ ]:
for i,v in h_tracks.tracks.items():
    print v.get_sets()
print [i.name for i in h_tracks.tracks.values()]

track_diagram = GenomeDiagram.Diagram("")
track = track_diagram.new_track(1, greytrack=False)
print track.get_sets()
feature_set= track.new_set(name=i)
feature_set.get_features()